# -*- coding=utf-8 -*-
# 2021.
# @zlk 
# zhanglk960127@163.com
# 提取基因间区的序列
from Bio import SeqIO
from Bio.Alphabet import IUPAC
import sys

file1=open(sys.argv[1],'r') # pos_file
out_file=open(sys.argv[3],'w')
out_file2=open(sys.argv[3]+'.bed','w')

gene_pos_dcit={}
for line in file1:
    line_list=line.strip().split('\t')
    if line_list[3] not in gene_pos_dcit.keys():
        gene_pos_dcit[line_list[3]]=[]
    pos_list=[]
    for i in line_list[4:]:
        pos_list.append(int(i.split(':')[0]))
        pos_list.append(int(i.split(':')[1]))
    gene_start=sorted(pos_list)[0]
    gene_end=sorted(pos_list)[1]
    gene_pos_dcit[line_list[3]].append([gene_start,gene_end])

inter_region_dict={}
for key,value in gene_pos_dcit.items():
    if len(value)<100:
        continue
    inter_region_dict[key]=[]
    sort_value=sorted(value,key=lambda x:x[0])
    last_end=0
    for i_list in sort_value:
        inter_region_dict[key].append([last_end,i_list[0]])
        last_end=i_list[1]
fasta_file = SeqIO.to_dict(SeqIO.parse(sys.argv[2], "fasta", IUPAC.unambiguous_dna)) 
index=0
for key,value in inter_region_dict.items():
    for pos_list in value:
        if pos_list[1]-pos_list[0]<100:
            continue
        index+=1
        out_str=str(fasta_file[key][pos_list[0]:pos_list[1]].seq)
        out_file.write('>inter_region'+str(index)+'\n'+out_str+'\n')
        out_file2.write(key+'\t'+str(pos_list[0])+'\t'+str(pos_list[1])+'\tinter_region'+str(index)+'\n')
    
         